# Load the tourism dataset from the Excel file Dataset_tourism.xlsx in the
# data folder
tourism_data <- readxl::read_xlsx(here("data/Dataset_tourism.xlsx"))
# 1 Tourism EDA - Zurich
1.1 Univariable analysis
1.1.1 Load data
1.1.2 Cleaning
# Drop the rows labelled 'Herkunftsland - Total' in column 'Herkunftsland':
# they only hold the total
tourism_data <- filter(tourism_data, Herkunftsland != "Herkunftsland - Total")
# Show the distinct values of the month column
unique(tourism_data[["Monat"]])
#> [1] "Januar" "Februar" "März" "April" "Mai"
#> [6] "Juni" "Juli" "August" "September" "Oktober"
#> [11] "November" "Dezember"
# Translate the German month names into English. The result is a factor whose
# levels follow the order of the lookup below (January ... December).
tourism_data$Monat <- recode_factor(
  tourism_data$Monat,
  !!!c(
    "Januar" = "January",
    "Februar" = "February",
    "März" = "March",
    "April" = "April",
    "Mai" = "May",
    "Juni" = "June",
    "Juli" = "July",
    "August" = "August",
    "September" = "September",
    "Oktober" = "October",
    "November" = "November",
    "Dezember" = "December"
  )
)
# Add a Date column (first day of each month) for plotting purposes.
# The date is assembled numerically: the English month name in 'Monat' is
# mapped to its month number with base R's `month.name`, and the resulting
# "YYYY-MM-DD" string is parsed with an explicit format. Unlike parsing month
# names with dmy(), this does not depend on the session's LC_TIME locale.
# Rows whose month or year cannot be resolved get an NA date.
tourism_data <- tourism_data %>%
  mutate(
    Date = as.Date(
      sprintf("%s-%02d-01", Jahr, match(as.character(Monat), month.name)),
      format = "%Y-%m-%d"
    )
  )
# Count the missing (NA) cells across the whole table
tourism_data %>% is.na() %>% sum()
#> [1] 51395
# Inspect the rows whose 'value' is missing to see where the gaps are
filter(tourism_data, is.na(value))
#> # A tibble: 51,395 x 6
#> Herkunftsland Kanton Monat Jahr value Date
#> <chr> <chr> <fct> <chr> <dbl> <date>
#> 1 Malta Schwe~ Janu~ 2005 NA 2005-01-01
#> 2 Zypern Schwe~ Janu~ 2005 NA 2005-01-01
#> 3 Mexiko Schwe~ Janu~ 2005 NA 2005-01-01
#> 4 Übriges Zentralamerika, Karib~ Schwe~ Janu~ 2005 NA 2005-01-01
#> 5 Bahrain Schwe~ Janu~ 2005 NA 2005-01-01
#> 6 Katar Schwe~ Janu~ 2005 NA 2005-01-01
#> 7 Kuwait Schwe~ Janu~ 2005 NA 2005-01-01
#> 8 Australien Schwe~ Janu~ 2005 NA 2005-01-01
#> 9 Neuseeland, Ozeanien Schwe~ Janu~ 2005 NA 2005-01-01
#> 10 Oman Schwe~ Janu~ 2005 NA 2005-01-01
#> # i 51,385 more rows
# Preview the first rows of the cleaned table
tourism_data %>% head()
#> # A tibble: 6 x 6
#> Herkunftsland Kanton Monat Jahr value Date
#> <chr> <chr> <fct> <chr> <dbl> <date>
#> 1 Schweiz Schweiz January 2005 482820 2005-01-01
#> 2 Baltische Staaten Schweiz January 2005 758 2005-01-01
#> 3 Deutschland Schweiz January 2005 135741 2005-01-01
#> 4 Frankreich Schweiz January 2005 34248 2005-01-01
#> 5 Italien Schweiz January 2005 34282 2005-01-01
#> 6 Österreich Schweiz January 2005 9194 2005-01-01
1.1.2.1 Deal with NA values
1.1.2.1.1 Impute missing values with ARIMA
If the missing values are random or if excluding them would result in a loss of valuable information, we might consider imputing them. One common approach is to use statistical models like ARIMA to interpolate missing values based on the patterns observed in the available data.
# #Creating a tsibble with missing values
# data <- tourism_data_zurich_philippines %>%
# as_tsibble(key = c(Kanton, Herkunftsland, Monat, Jahr)) %>%
# select(Date, value) %>%
# fill_gaps()
#
# # Fit an ARIMA model to data with missing values
# model_fit <- data %>%
# model(ARIMA(value))
#
# # Interpolate missing values using the fitted ARIMA model
# filled_data <- model_fit %>%
# interpolate(data)
#
# # Print the data with filled in missing values
# print(filled_data)
1.1.3 Countries visiting Zurich
# Keep only the rows whose 'Kanton' is Zurich
tourism_data_zurich <- filter(tourism_data, Kanton == "Zürich")
# Count the missing (NA) cells in the Zurich subset
tourism_data_zurich %>% is.na() %>% sum()
#> [1] 1869
# Inspect the rows whose 'value' is missing to see where the gaps are
filter(tourism_data_zurich, is.na(value))
#> # A tibble: 1,869 x 6
#> Herkunftsland Kanton Monat Jahr value Date
#> <chr> <chr> <fct> <chr> <dbl> <date>
#> 1 Malta Zürich Janu~ 2005 NA 2005-01-01
#> 2 Zypern Zürich Janu~ 2005 NA 2005-01-01
#> 3 Mexiko Zürich Janu~ 2005 NA 2005-01-01
#> 4 Übriges Zentralamerika, Karib~ Zürich Janu~ 2005 NA 2005-01-01
#> 5 Bahrain Zürich Janu~ 2005 NA 2005-01-01
#> 6 Katar Zürich Janu~ 2005 NA 2005-01-01
#> 7 Kuwait Zürich Janu~ 2005 NA 2005-01-01
#> 8 Australien Zürich Janu~ 2005 NA 2005-01-01
#> 9 Neuseeland, Ozeanien Zürich Janu~ 2005 NA 2005-01-01
#> 10 Oman Zürich Janu~ 2005 NA 2005-01-01
#> # i 1,859 more rows
# Preview the first rows of the Zurich subset
tourism_data_zurich %>% head()
#> # A tibble: 6 x 6
#> Herkunftsland Kanton Monat Jahr value Date
#> <chr> <chr> <fct> <chr> <dbl> <date>
#> 1 Schweiz Zürich January 2005 41094 2005-01-01
#> 2 Baltische Staaten Zürich January 2005 144 2005-01-01
#> 3 Deutschland Zürich January 2005 22537 2005-01-01
#> 4 Frankreich Zürich January 2005 3870 2005-01-01
#> 5 Italien Zürich January 2005 3828 2005-01-01
#> 6 Österreich Zürich January 2005 3006 2005-01-01
1.1.3.1 Plot time series
# Prepare the data
# Drop 'Schweiz' from column 'Herkunftsland', as it is just the whole of
# Switzerland
tourism_data_zurich <- tourism_data_zurich %>%
  filter(Herkunftsland != "Schweiz")
# Total trips per origin country and date. Rows with a missing 'value' are
# removed first so the sums are never NA. Only Herkunftsland, Date and Trips
# survive summarise(), so no extra month/year columns are derived here.
data <- tourism_data_zurich %>%
  filter(!is.na(value)) %>%
  group_by(Herkunftsland, Date) %>%
  summarise(Trips = sum(value), .groups = "drop")
# Plot one line per origin country: the Philippines in red, all other
# countries in grey. The legend is switched off on the layer, so no legend
# customisation is needed.
ggplot(data, aes(x = Date, y = Trips, group = Herkunftsland)) +
  geom_line(aes(color = Herkunftsland == "Philippinen"), show.legend = FALSE) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  labs(
    title = "Number of Trips from Each Country to Zurich",
    x = "Date",
    y = "Number of Trips"
  ) +
  theme_minimal()
# Static ggplot
# Build the line chart with an additional `text` aesthetic (country name and
# trip count) that is used as the hover tooltip after conversion to plotly.
# The Philippines are drawn in red, all other countries in grey.
p <- ggplot(
  data,
  aes(
    x = Date, y = Trips, group = Herkunftsland,
    color = Herkunftsland == "Philippinen",
    text = paste("Country:", Herkunftsland, "<br>Trips:", Trips)
  )
) +
  geom_line(show.legend = FALSE) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  labs(
    title = "Number of Trips from Each Country to Zurich",
    x = "Date",
    y = "Number of Trips"
  ) +
  theme_minimal()
# Convert to an interactive plotly object, showing only the `text` aesthetic
# in the tooltip
interactive_plot <- ggplotly(p, tooltip = "text")
# Adjust the plotly layout: margins and legend position
interactive_plot <- interactive_plot %>%
  layout(
    margin = list(l = 60, r = 60, b = 60, t = 80),
    legend = list(orientation = "h", x = 0, xanchor = "left", y = -0.2)
  )
# Display the interactive plot
interactive_plot
# 1.1.4 Filter data for Zurich and the Philippines
1.1.4.1 Pattern
1.1.4.1.1 Decompose
# Keep only the rows whose origin country is the Philippines
tourism_data_zurich_philippines <- filter(
  tourism_data_zurich,
  Herkunftsland == "Philippinen"
)
tourism_data_zurich_philippines %>% head()
#> # A tibble: 6 x 6
#> Herkunftsland Kanton Monat Jahr value Date
#> <chr> <chr> <fct> <chr> <dbl> <date>
#> 1 Philippinen Zürich January 2005 57 2005-01-01
#> 2 Philippinen Zürich February 2005 30 2005-02-01
#> 3 Philippinen Zürich March 2005 46 2005-03-01
#> 4 Philippinen Zürich April 2005 73 2005-04-01
#> 5 Philippinen Zürich May 2005 74 2005-05-01
#> 6 Philippinen Zürich June 2005 73 2005-06-01
# Convert the data to a monthly time series object
tourism_ts <- tourism_data_zurich_philippines %>%
  # Ensure there is one row for every month between the first and last date
  complete(Date = seq.Date(min(Date), max(Date), by = "month")) %>%
  # Months without a recorded value are counted as zero
  replace_na(list(value = 0)) %>%
  # Make sure the values are in chronological order before building the series
  arrange(Date) %>%
  # Give the start as c(year, month) so the series sits exactly on the monthly
  # grid whatever the first month is (a decimal date only coincides with that
  # grid for a January start)
  with(ts(value, frequency = 12, start = c(year(min(Date)), month(min(Date)))))
# Decompose the time series into trend, seasonal and random components
decomposed <- decompose(tourism_ts)
# Plot the decomposed components
plot(decomposed)
# 1.1.4.1.2 Seasonality
# One chart
# Several charts per month
1.2 Tourism EDA - Zurich with Philippine visitors
#filter tourism_data for Philipine visitors in Zurich and autoplot the time series with phili